# title: "integration_v2"
## author: "Jingwei Song"
## date: "06/30/2023"

use aggr_fltd_v2 (no sperm)

library(SingleCellExperiment)
library(Seurat)
library(tidyverse)
library(Matrix)
library(scales)
library(cowplot)
library(RCurl)

#library(Seurat, lib.loc = "/usr/local/lib/R/site-library")
setwd("~/blue_jsong1/Seurat_Jan_2023/Cell ranger v7 integrate ALL")

# load ACME seurat
#load("~/blue_jsong1/Seurat_Jan_2023/Cell ranger v7 integrate ALL/data/ACME_Filtered_seurat_PC50_dim15_res.0.5.RData")
load("data/ACME_Filtered_seurat_PC50_dim15_res.0.5.RData")
ACME <- filtered_seurat
rm(filtered_seurat)

# load Justin's seurat
#load("~/blue_jsong1/Seurat_Jan_2023/Cell ranger v7 integrate ALL/data/aggr_fltd_PC50_dim15_res.0.5_v2.RData")
load("data/aggr_fltd_PC50_dim15_res.0.5_v2.RData")

# load BR4/BR5 merged
#load("~/blue_jsong1/Seurat_Jan_2023/Cell ranger v7 integrate ALL/data/filtered_seurat_PC50_dims15_res0.5_wo_artefact.RData")
load("data/filtered_seurat_PC50_dims15_res0.5_wo_artefact.RData")
BR_new <- no_C2
rm(no_C2)

# all SCT as active assay
ACME
## An object of class Seurat 
## 38857 features across 17641 samples within 2 assays 
## Active assay: SCT (16823 features, 3000 variable features)
##  1 other assay present: RNA
##  2 dimensional reductions calculated: pca, umap
aggr_fltd
## An object of class Seurat 
## 29865 features across 4023 samples within 2 assays 
## Active assay: SCT (14797 features, 3000 variable features)
##  1 other assay present: RNA
##  2 dimensional reductions calculated: pca, umap
BR_new
## An object of class Seurat 
## 39206 features across 26366 samples within 2 assays 
## Active assay: SCT (17172 features, 3000 variable features)
##  1 other assay present: RNA
##  2 dimensional reductions calculated: pca, umap
#DefaultAssay(filtered_seurat) <- "SCT"
split_seurat <- list(aggr_fltd, ACME, BR_new)

# Select the most variable features to use for integration
integ_features <- SelectIntegrationFeatures(object.list = split_seurat, 
                                            nfeatures = 3000) 

# Prepare the SCT list object for integration
split_seurat <- PrepSCTIntegration(object.list = split_seurat, 
                                   anchor.features = integ_features)

integ_anchors <- FindIntegrationAnchors(object.list = split_seurat, 
                                        normalization.method = "SCT", 
                                        anchor.features = integ_features)

seurat_integrated <- IntegrateData(anchorset = integ_anchors, 
                                   normalization.method = "SCT")
## Warning: Attempting to merge an SCTAssay with another Assay type 
## Converting all to standard Assay objects.

## Warning: Attempting to merge an SCTAssay with another Assay type 
## Converting all to standard Assay objects.
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 1.1 GiB
seurat_integrated # 53525 cells
## An object of class Seurat 
## 42657 features across 48030 samples within 3 assays 
## Active assay: integrated (3000 features, 3000 variable features)
##  2 other assays present: RNA, SCT
# Run PCA
seurat_integrated <- RunPCA(seurat_integrated, npcs = 50)

# Plot PCA
PCAPlot(seurat_integrated,
        split.by = "sample")  

# Run UMAP
seurat_integrated <- RunUMAP(seurat_integrated, 
                             dims = 1:20,
                             reduction = "pca")
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
seurat_integrated <- FindNeighbors(seurat_integrated, reduction = "pca", dims = 1:20)
  

seurat_integrated <- FindClusters(seurat_integrated, resolution = c(0.3,0.5))  
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 48030
## Number of edges: 1761532
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9641
## Number of communities: 20
## Elapsed time: 10 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 48030
## Number of edges: 1761532
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9505
## Number of communities: 22
## Elapsed time: 10 seconds
DimPlot(seurat_integrated,label = T) + NoLegend() + NoAxes()

DimPlot(seurat_integrated,label = T, split.by = "sample") + NoLegend() + NoAxes()

colnames(seurat_integrated@meta.data)
##  [1] "orig.ident"             "nCount_RNA"             "nFeature_RNA"          
##  [4] "seq_folder"             "nUMI"                   "nGene"                 
##  [7] "log10GenesPerUMI"       "mitoRatio"              "cells"                 
## [10] "sample"                 "nCount_SCT"             "nFeature_SCT"          
## [13] "SCT_snn_res.0.5"        "seurat_clusters"        "integrated_snn_res.0.3"
## [16] "integrated_snn_res.0.5"
Idents(seurat_integrated) <- "integrated_snn_res.0.3"
DimPlot(seurat_integrated,label = T) + NoLegend() + NoAxes()

# methanol_new has some intermediary cells which could delete
DefaultAssay(seurat_integrated) <- "SCT"
FeaturePlot(seurat_integrated, features = c("HyS0018.100", "HyS0061.60", #HMG, PCNA
                                "HyS0008.263", "HyS0030.203", #Ncol1, Nematocilin
                                "HyS0041.99", "HyS0004.446", #chitinase,mucs
                                "HyS0085.53", "HyS0013.338", #ELAV, Rfamide
                                "HyS0017.253", "HyS0010.323",#Wnt5, Pitx
                                "HyS0001.667", "HyS0011.54"), 
            order = T, min.cutoff = 'q5', max.cutoff = 'q95')
## Warning in FeaturePlot(seurat_integrated, features = c("HyS0018.100",
## "HyS0061.60", : All cells have the same value (0.693147180559945) of
## HyS0001.667.

FeaturePlot(seurat_integrated, features = c("HyS0041.99", "HyS0004.446")) #gland cell

FeaturePlot(seurat_integrated, features = c("HyS0085.53", "HyS0013.338")) #neuron

FeaturePlot(seurat_integrated, features = c("HyS0053.57")) # NC unknown

# save(seurat_integrated, file="output/Seurat_integrated_features3k_PC50_dims20_res0.5_v2.RData")

# load 

DimPlot(seurat_integrated,label = T) + NoLegend() + NoAxes()+ DarkTheme()

seurat_integrated@meta.data %>% 
  ggplot(aes(x=sample, fill=sample)) + 
  geom_bar() +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  theme(plot.title = element_text(hjust=0.5, face="bold")) +
  ggtitle("NCells")

seurat_integrated@meta.data %>% 
  ggplot(aes(color=sample, x=nGene, fill= sample)) + 
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  ylab("Cell density") +
  geom_vline(xintercept = 300)

seurat_integrated@meta.data %>% 
  ggplot(aes(color=sample, x=nUMI, fill= sample)) + 
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  ylab("Cell density") +
  geom_vline(xintercept = 300)

seurat_integrated@meta.data %>%
  ggplot(aes(x=log10GenesPerUMI, color = sample, fill=sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  geom_vline(xintercept = 0.8)

seurat_integrated@meta.data %>% 
  ggplot(aes(color=sample, x=mitoRatio, fill=sample)) + 
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  geom_vline(xintercept = 0.2)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 28313 rows containing non-finite values (`stat_density()`).

seurat_integrated@meta.data %>% 
  ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) + 
  geom_point() + 
  scale_colour_gradient(low = "gray90", high = "black") +
  stat_smooth(method=lm) +
  scale_x_log10() + 
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = 500) +
  geom_hline(yintercept = 250) +
  facet_wrap(~sample)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

# # nb/cnidocytes
# #HyS0030.203 Nematocilin
# #HyS0042.80 ARSTnd2-like
# #HyS0008.263 Ncol1
# #HyS0004.359 Ncol3
# 
# FeaturePlot(seurat_integrated, features = c("HyS0008.263", "HyS0004.359","HyS0042.80", "HyS0030.203" ), order = T, min.cutoff = 'q10')
# 
# 
# # stem cell
# # HyS0050.7 PIWI
# #HyS0049.34 PIWI-like
# #HyS0061.60 PCNA
# #HyS0098.34 PTER
# FeaturePlot(seurat_integrated, features = c("HyS0050.7", "HyS0049.34", "HyS0061.60", "HyS0098.34"), order = T, min.cutoff = 'q10')
# # only PTER is in the integrated assay, 
# 
# # Raw reads
# DefaultAssay(seurat_integrated) <- "RNA"
# FeaturePlot(seurat_integrated, features = c("HyS0050.7", "HyS0049.34", "HyS0061.60", "HyS0098.34"), 
#             slot = "counts", order = T, min.cutoff = 'q10')
# 
# seurat_integrated <- NormalizeData(seurat_integrated, normalization.method = "LogNormalize")
# 
# # how many cells with piwi expression (>=1?) 
# counts <- GetAssayData(seurat_integrated, slot = "counts")
# counts[1:5, 1:5]
# 
# piwi_counts <- counts["HyS0050.7",]
# summary(piwi_counts)
# nonzero <- piwi_counts > 0
# sum(nonzero) # 3670 cells with nonzero piwi expression
# head(nonzero)
# 
# table(piwi_counts)
# 
# HMG_counts <- counts["HyS0018.100",]
# plot(table((HMG_counts)))

# sessioninfo
sessionInfo()
## R version 4.2.3 (2023-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] RCurl_1.98-1.12             cowplot_1.1.1              
##  [3] scales_1.2.1                Matrix_1.5-4.1             
##  [5] lubridate_1.9.2             forcats_1.0.0              
##  [7] stringr_1.5.0               dplyr_1.1.2                
##  [9] purrr_1.0.1                 readr_2.1.4                
## [11] tidyr_1.3.0                 tibble_3.2.1               
## [13] ggplot2_3.4.2               tidyverse_2.0.0            
## [15] SeuratObject_4.1.3          Seurat_4.3.0               
## [17] SingleCellExperiment_1.20.1 SummarizedExperiment_1.28.0
## [19] Biobase_2.58.0              GenomicRanges_1.50.2       
## [21] GenomeInfoDb_1.34.9         IRanges_2.32.0             
## [23] S4Vectors_0.36.2            BiocGenerics_0.44.0        
## [25] MatrixGenerics_1.10.0       matrixStats_0.63.0         
## 
## loaded via a namespace (and not attached):
##   [1] plyr_1.8.8             igraph_1.4.3           lazyeval_0.2.2        
##   [4] sp_1.6-0               splines_4.2.3          listenv_0.9.0         
##   [7] scattermore_1.1        digest_0.6.31          htmltools_0.5.5       
##  [10] fansi_1.0.4            magrittr_2.0.3         tensor_1.5            
##  [13] cluster_2.1.4          ROCR_1.0-11            tzdb_0.4.0            
##  [16] globals_0.16.2         timechange_0.2.0       spatstat.sparse_3.0-1 
##  [19] colorspace_2.1-0       ggrepel_0.9.3          xfun_0.39             
##  [22] jsonlite_1.8.4         progressr_0.13.0       spatstat.data_3.0-1   
##  [25] survival_3.5-5         zoo_1.8-12             glue_1.6.2            
##  [28] polyclip_1.10-4        gtable_0.3.3           zlibbioc_1.44.0       
##  [31] XVector_0.38.0         leiden_0.4.3           DelayedArray_0.24.0   
##  [34] future.apply_1.11.0    abind_1.4-5            DBI_1.1.3             
##  [37] spatstat.random_3.1-5  miniUI_0.1.1.1         Rcpp_1.0.10           
##  [40] viridisLite_0.4.2      xtable_1.8-4           reticulate_1.28       
##  [43] htmlwidgets_1.6.2      httr_1.4.6             RColorBrewer_1.1-3    
##  [46] ellipsis_0.3.2         ica_1.0-3              farver_2.1.1          
##  [49] pkgconfig_2.0.3        sass_0.4.6             uwot_0.1.14           
##  [52] deldir_1.0-9           utf8_1.2.3             labeling_0.4.2        
##  [55] tidyselect_1.2.0       rlang_1.1.1            reshape2_1.4.4        
##  [58] later_1.3.1            munsell_0.5.0          tools_4.2.3           
##  [61] cachem_1.0.8           cli_3.6.1              generics_0.1.3        
##  [64] ggridges_0.5.4         evaluate_0.21          fastmap_1.1.1         
##  [67] yaml_2.3.7             goftest_1.2-3          knitr_1.43            
##  [70] fitdistrplus_1.1-11    RANN_2.6.1             pbapply_1.7-0         
##  [73] future_1.32.0          nlme_3.1-162           mime_0.12             
##  [76] compiler_4.2.3         rstudioapi_0.14        plotly_4.10.1         
##  [79] png_0.1-8              spatstat.utils_3.0-3   bslib_0.4.2           
##  [82] stringi_1.7.12         highr_0.10             lattice_0.21-8        
##  [85] vctrs_0.6.2            pillar_1.9.0           lifecycle_1.0.3       
##  [88] spatstat.geom_3.2-1    lmtest_0.9-40          jquerylib_0.1.4       
##  [91] RcppAnnoy_0.0.20       data.table_1.14.8      bitops_1.0-7          
##  [94] irlba_2.3.5.1          httpuv_1.6.11          patchwork_1.1.2       
##  [97] R6_2.5.1               promises_1.2.0.1       KernSmooth_2.23-21    
## [100] gridExtra_2.3          parallelly_1.36.0      codetools_0.2-19      
## [103] MASS_7.3-60            withr_2.5.0            sctransform_0.3.5     
## [106] GenomeInfoDbData_1.2.9 mgcv_1.8-42            parallel_4.2.3        
## [109] hms_1.1.3              grid_4.2.3             rmarkdown_2.21        
## [112] Rtsne_0.16             spatstat.explore_3.2-1 shiny_1.7.4